#4. Hierarchical Cluster Analysis

#add libraries and source documents
#(data.stand, data.trans, hclus.table, hclus.cophenetic, hclus.scree, box.plots and clus.stats are not in the
# packages loaded below; they come from a separately sourced helper script)
library(vegan)
library(permute)
library(cluster)
library(pvclust)

envdata<-read.csv('env.csv',header=TRUE,row.names=1)
speabu<-read.csv('abund.csv',header=TRUE,row.names=1)

###compute the dissimilarity/distance matrix
#standardize the env data first (since the measurements are on different scales)
envdata.tra<-data.stand(envdata,method='standardize',margin='column',plot=F)
#calculate Euclidean distance
site.eucd<-vegdist(envdata.tra,method='euclidean')

###compute hierarchical clustering
#hierarchical agglomerative clustering on the dissimilarity matrix (many linkage options: complete, average, median, etc.)
sitecl.com<-hclust(site.eucd,method='complete')
sitecl.ave<-hclust(site.eucd,method='average')
sitecl.ward<-hclust(site.eucd,method='ward.D2')  #'ward' was renamed to 'ward.D'/'ward.D2' in newer versions of R

#now look at them..
#print a summary
hclus.table(sitecl.ave)
#plot a dendrogram
plot(sitecl.ave)
#make it fancier
plot(sitecl.ave,main='Average-linkage Dendrogram',
     xlab='Sites',ylab='Euclidean distance',hang=-1)

##DO IT AGAIN WITH A DIFFERENT DATASET - species abundance
#transform abundances to presence/absence (power transformation with exponent 0)
speocc<-data.trans(speabu[,-1],method='power',exp=0,plot=F)
#calculate the dissimilarity coefficient (Jaccard)
sp.jacd<-vegdist(speocc,method='jaccard')
#calculate the clusters (average linkage)
specl.ave<-hclust(sp.jacd,method='average')
#plot it
plot(specl.ave)
#calculate clusters on the env. distances using single linkage
sitecl.sin<-hclust(site.eucd,method='single')

##DECIDE ON NUMBER OF CLUSTERS TO RETAIN
#first go back and look at the plot of the env. data
plot(sitecl.ave)
#now tell it that you want 9 clusters (outlined in red)
rect.hclust(sitecl.ave,k=9)
#or you can decide based on height (distance)
#note: replot the dendrogram first or the rectangles just pile up on top of each other
plot(sitecl.ave)
rect.hclust(sitecl.ave,h=6)
#based on the analysis we now 'cut' the tree into clusters (this creates a vector 'sitecl.class' that
#contains the cluster memberships, which will be used in the next analysis)
sitecl.class<-cutree(sitecl.ave,k=9)
#look at it to make sure there are 9 clusters
sitecl.class

##CLUSTER DIAGNOSTICS - evaluating the cluster solution
#compute the agglomerative coefficient (>0.75 is considered good)
coef.hclust(sitecl.ave)
coef.hclust(sitecl.com)
#or compute the cophenetic correlation coefficient
cor(site.eucd,cophenetic(sitecl.ave))
#can also plot the cophenetic correlation
hclus.cophenetic(site.eucd,sitecl.ave)
#scree plot - looks at the dissimilarity between clusters; the 'elbow' is usually a good place to 'cut' the dendrogram
hclus.scree(sitecl.sin)
hclus.scree(sitecl.ave)

#Looking at the stability of clusters using bootstrapping
#BOOTSTRAP test: do your clusters remain intact if you take a random subsample of your objects with replacement?
#This function (usually used in DNA analysis) does that. Note: t() transposes the data because the function
#assumes objects are columns (not rows), and only 50 bootstraps (nboot) are run to save computation time
clus.stab<-pvclust(t(envdata.tra),method.hclust='average',method.dist='euclidean',nboot=50)
#now look at the plot and highlight significant clusters at a critical p<0.01
plot(clus.stab)
pvrect(clus.stab,alpha=0.99)

#once we have examined the clusters we may want to describe DIFFERENCES AMONG CLUSTERS,
#in particular univariate differences among clusters
#one option is to bind the cluster classes (9 of them, from above) to the env. data
envdata.new<-cbind(sitecl.class,envdata)
#the cluster (1-9) is now the first column of the matrix
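#OPTIONAL: a minimal base-R sketch for a first look at differences among clusters
#(uses only the sitecl.class vector and envdata created above): cluster sizes, then per-cluster means
table(sitecl.class)
aggregate(envdata,by=list(cluster=sitecl.class),FUN=mean)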
#now use boxplots to infer differences
box.plots(envdata.new,by='sitecl.class')
#a second option is to use a nonparametric Kruskal-Wallis rank test
clus.stats(envdata.new,sitecl.class)

#Extra things and different libraries/methods
#use agnes() instead of hclust()
sitecl.agnes<-agnes(envdata.tra,metric='euclidean',method='complete')
plot(sitecl.agnes)
#hierarchical divisive clustering
sitecl.dia<-diana(site.eucd)
#monothetic hierarchical divisive clustering,
#i.e. using a single descriptor as the basis for partitioning the data at each step;
#use the mona() function (it requires binary data, so use the presence/absence matrix)
sitecl.mona<-mona(speocc)
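#OPTIONAL: a minimal sketch for inspecting the divisive solutions; the cluster package provides
#plot methods for diana and mona objects (a banner plot, plus a dendrogram in the diana case)
plot(sitecl.dia,main='Divisive clustering (DIANA)')
plot(sitecl.mona,main='Monothetic divisive clustering (MONA)')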